library(AppliedPredictiveModeling)   # Data sets
library(tidyverse)                   # Oppan tidy style
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.5
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(caret)                       # Modeling
Loading required package: lattice

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

    lift
library(e1071)                       # skewness
Error in library(e1071) : there is no package called ‘e1071’

Compute

rr data(segmentationOriginal) segmentationOriginal <- as_tibble(segmentationOriginal) segmentationOriginal

rr seg_data <- subset(segmentationOriginal, Case == ) seg_data

rr cell_id <- seg_data\(Case class <- seg_data\)Class case <- seg_data$Case seg_data <- seg_data[, -(1:3)] seg_data %>% select(-contains()) -> seg_data seg_data

Skewness

rr library(e1071) skewness(seg_data$AngleCh1)

[1] -0.02426252

rr #seg_data %>% map_dfr(skewness) summarize_all(seg_data, skewness)

Box-Cox transform

rr Ch1AreaTrans <- BoxCoxTrans(seg_data$AreaCh1) Ch1AreaTrans

Box-Cox Transformation

1009 data points used to estimate Lambda

Input data summary:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  150.0   194.0   256.0   325.1   376.0  2186.0 

Largest/Smallest: 14.6 
Sample Skewness: 3.53 

Estimated Lambda: -0.9 

Apply the transform with the predict function

rr predict(Ch1AreaTrans, head(seg_data$AreaCh1)) -> dat dat

[1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523

Or perform it all at once via caret::preProcess

rr percent_variance[1:3]

[1] 20.91236 17.01330 11.88689

Near zero variance

rr nearZeroVar(seg_data)

integer(0)

Correlations

rr correlations <- cor(seg_data) dim(correlations)

[1] 58 58

rr correlations[1:4, 1:4]

                AngleCh1      AreaCh1 AvgIntenCh1 AvgIntenCh2
AngleCh1     1.000000000 -0.002627172 -0.04300776 -0.01944681
AreaCh1     -0.002627172  1.000000000 -0.02529739 -0.15330301
AvgIntenCh1 -0.043007757 -0.025297394  1.00000000  0.52521711
AvgIntenCh2 -0.019446810 -0.153303007  0.52521711  1.00000000

Exercises

3.1

library(mlbench)
data(Glass)
str(Glass)
'data.frame':   214 obs. of  10 variables:
 $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
 $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
 $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
 $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
 $ Si  : num  71.8 72.7 73 72.6 73.1 ...
 $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
 $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
 $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
 $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

Predictor variables

map_if(Glass, .p = is.numeric, .f = BoxCoxTrans) %>% map("lambda")
$RI
[1] -2

$Na
[1] -0.1

$Mg
[1] NA

$Al
[1] 0.5

$Si
[1] 2

$K
[1] NA

$Ca
[1] -1.1

$Ba
[1] NA

$Fe
[1] NA

$Type
NULL
LS0tCnRpdGxlOiAiQ2hhcHRlciAzIC0gRGF0YSBQcmUtcHJvY2Vzc2luZyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyIHNldHVwfQpsaWJyYXJ5KEFwcGxpZWRQcmVkaWN0aXZlTW9kZWxpbmcpICAgIyBEYXRhIHNldHMKbGlicmFyeSh0aWR5dmVyc2UpICAgICAgICAgICAgICAgICAgICMgT3BwYW4gdGlkeSBzdHlsZQpsaWJyYXJ5KGNhcmV0KSAgICAgICAgICAgICAgICAgICAgICAgIyBNb2RlbGluZwpsaWJyYXJ5KGUxMDcxKSAgICAgICAgICAgICAgICAgICAgICAgIyBza2V3bmVzcwpgYGAKCiMjIENvbXB1dGUKCmBgYHtyfQpkYXRhKHNlZ21lbnRhdGlvbk9yaWdpbmFsKQpzZWdtZW50YXRpb25PcmlnaW5hbCA8LSBhc190aWJibGUoc2VnbWVudGF0aW9uT3JpZ2luYWwpCnNlZ21lbnRhdGlvbk9yaWdpbmFsCmBgYAoKYGBge3J9CnNlZ19kYXRhIDwtIHN1YnNldChzZWdtZW50YXRpb25PcmlnaW5hbCwgQ2FzZSA9PSAiVHJhaW4iKQpzZWdfZGF0YQpgYGAKCmBgYHtyfQpjZWxsX2lkIDwtIHNlZ19kYXRhJENhc2UKY2xhc3MgPC0gc2VnX2RhdGEkQ2xhc3MKY2FzZSA8LSBzZWdfZGF0YSRDYXNlCnNlZ19kYXRhIDwtIHNlZ19kYXRhWywgLSgxOjMpXQpzZWdfZGF0YSAlPiUgc2VsZWN0KC1jb250YWlucygiU3RhdHVzIikpIC0+IHNlZ19kYXRhCnNlZ19kYXRhCmBgYAoKU2tld25lc3MKCmBgYHtyfQpza2V3bmVzcyhzZWdfZGF0YSRBbmdsZUNoMSkKI3NlZ19kYXRhICU+JSBtYXBfZGZyKHNrZXduZXNzKQpzdW1tYXJpemVfYWxsKHNlZ19kYXRhLCBza2V3bmVzcykKYGBgCgpCb3gtQ294IHRyYW5zZm9ybQpgYGB7cn0KQ2gxQXJlYVRyYW5zIDwtIEJveENveFRyYW5zKHNlZ19kYXRhJEFyZWFDaDEpCkNoMUFyZWFUcmFucwpgYGAKCkFwcGx5IHRoZSB0cmFuc2Zvcm0gd2l0aCB0aGUgYHByZWRpY3RgIGZ1bmN0aW9uCgpgYGB7cn0KcHJlZGljdChDaDFBcmVhVHJhbnMsIGhlYWQoc2VnX2RhdGEkQXJlYUNoMSkpIC0+IGRhdApkYXQKYGBgCgpPciBwZXJmb3JtIGl0IGFsbCBhdCBvbmNlIHZpYSBgY2FyZXQ6OnByZVByb2Nlc3NgCgpgYGB7cn0KcGNhX29iamVjdCA8LSBwcmNvbXAoc2VnX2RhdGEsIGNlbnRlciA9IFRSVUUsIHNjYWxlID0gVFJVRSkKcGVyY2VudF92YXJpYW5jZSA8LSBwY2Ffb2JqZWN0JHNkZXZeMi9zdW0ocGNhX29iamVjdCRzZGV2XjIpKjEwMApwZXJjZW50X3ZhcmlhbmNlWzE6M10KYGBgCgpOZWFyIHplcm8gdmFyaWFuY2UKYGBge3J9Cm5lYXJaZXJvVmFyKHNlZ19kYXRhKQpgYGAKCkNvcnJlbGF0aW9ucwoKYGBge3J9CmNvcnJlbGF0aW9ucyA8LSBjb3Ioc2VnX2RhdGEpCmRpbShjb3JyZWxhdGlvbnMpCmNvcnJlbGF0aW9uc1sxOjQsIDE6NF0KYGBgCgpgYGB7cn0KY29ycnBsb3Q6OmNvcnJwbG90KGNvcnJlbGF0aW9ucywgb3JkZXIgPSAiaGNsdXN0IikgCmBgYAoKIyMgRXhlcmNpc2VzCgozLjEgCgpgYGB7cn0KbGlicmFyeShtbGJlbmNoKQpkYXRhKEdsYXNzKQpzdHIoR2xhc3MpCmBgYAoKUHJlZGljdG9yIHZhcmlhYmxlcwoKYGBge3J9CnNlbGVjdChHbGFzcywgLVR5cGUpICU+JSBjb3IoKSAlPiUgY29ycnBsb3Q6OmNvcnJwbG90KCkKYGBgCgpgYGB7cn0KY2FyZXQ6Om5lYXJaZXJvVmFyKEdsYXNzKQpzdW1tYXJpemVfaWYoR2xhc3MsIC5wID0gaXMubnVtZXJpYywgLmYgPSBlMTA3MTo6c2tld25lc3MpCm1hcF9pZihHbGFzcywgLnAgPSBpcy5udW1lcmljLCAuZiA9IEJveENveFRyYW5zKSAlPiUgbWFwKCJsYW1iZGEiKQpjYXJldDo6bmVnUHJlZFZhbHVlKEdsYXNzKQpgYGAKCg==